library(dplyr)
library(ggplot2)
infn = "../data/human.csv"
metadata = read.table(infn, sep=",", header=TRUE)
metadata$condition = ifelse(grepl("Healthy", metadata$What), "HC",
ifelse(grepl("NASH", metadata$What), "OSA",
ifelse(grepl("Steat", metadata$What), "SS", "NAFLD")))
metadata$condition = as.factor(metadata$condition)
metadata$spiked = ifelse(grepl("Spike", metadata$Spike.In), "spike", "nospike")
metadata$column = ifelse(grepl("ymo", metadata$Column), "zymo",
ifelse(grepl("None", metadata$Column), "none", "other"))
metadata$id = paste(metadata$Full.Sample.Name, metadata$column, metadata$spiked, metadata$condition, sep="_")
set.seed(12345)
library(NanoStringQCPro)
rccFiles = paste("../data/rcc/", metadata[,2], sep="")
eset = newRccSet(rccFiles=rccFiles)
## Reading RCC files...
## checkRccSet() messages:
## Optional featureData cols not found: BarCode, ProbeID, TargetSeq, Species, Comments
## The following panel housekeeping genes were found: B2M|0, GAPDH|0, RPL19|0, ACTB|0, RPLP0|0
## Warning in .local(rccSet, ...):
## Non-standard CodeClass values found in featureData (values currently recognized are "Endogenous", "Housekeeping", "Positive", and "Negative"): Ligation, Endogenous1, SpikeIn)
colnames(eset) = metadata$id
A small set of miRNA have known inflated values; Nanostring has a heuristic to correct for the inflation via the formula using some supplied constants. Here we implement that fix to correct the expression of those miRNA.
correct_miRNA = function(eset) {
require(dplyr)
fdat = fData(eset)
fdat = fdat %>%
tidyr::separate(GeneName, c("Name", "scalefactor"), sep='\\|',
fill="right", remove=FALSE) %>%
dplyr::mutate(scalefactor=ifelse(is.na(scalefactor), 0, scalefactor))
sf = as.numeric(fdat$scalefactor)
posa = as.numeric(exprs(eset)["Positive_POS_A_ERCC_00117.1",])
scales = t(replicate(length(sf), posa))
scales = scales * sf
scaled = exprs(eset) - scales
scaled[scaled<0] = 0
return(round(scaled))
}
plot_corrected_miRNA = function(scaled, eset) {
require(reshape)
require(ggplot2)
counts = exprs(eset)
rownames(counts) = rownames(scaled)
is_scaled = rowSums(abs(counts - scaled)) > 0
counts = melt(as.matrix(counts[is_scaled,]))
colnames(counts) = c("gene", "sample", "value")
counts$scaled = "no"
scaled = melt(as.matrix(scaled[is_scaled,]))
colnames(scaled) = c("gene", "sample", "value")
scaled$scaled = "yes"
all = rbind(counts, scaled)
library(cowplot)
ggplot(all, aes(sample, value, color=scaled)) + facet_wrap(~gene) +
geom_point(size=0.5) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
guides(colour = guide_legend(override.aes = list(size=6))) + ylab("") +
ggtitle("miRNA hybridization crosstalk correction")
}
scaled = correct_miRNA(eset)
plot_corrected_miRNA(scaled, eset)
## Loading required package: reshape
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
counts = as.data.frame(scaled)
colnames(counts) = metadata$id
This percentage should be super high, close to 100%. If there isn’t that usually means the cartridge needs to be rescanned. If it is < 3 weeks from the scan, rescanning should be okay. Nanostring folks call < 75% a failure. These all look fine.
library(cowplot)
pdat = pData(eset) %>%
tibble::rownames_to_column() %>%
left_join(metadata, by=c("rowname"="id"))
pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
ggplot(pdat, aes(rowname, pcounted)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(pdat$pcounted))) +
ylab("percentage of FOV counted") + xlab("sample") +
geom_hline(yintercept=75, color="red")
Binding density looks ok.
ggplot(pdat, aes(rowname, BindingDensity)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
ylab("Binding density") + xlab("sample") +
geom_hline(yintercept=0.05, color="red") +
geom_hline(yintercept=2.25, color="red")
Here I plotted the total counts vs the number of miRNA detected, where detected is it has counts > 30. There is a pretty big spread among the samples in terms of the miRNA detected. The Zymo columns seem to all have a low number of miRNA detected.
endocounts = counts[grepl("Endo", rownames(counts)),]
cdf = data.frame(total=colSums(counts), detected=colSums(counts > 30))
rownames(cdf) = colnames(counts)
cdf$id = rownames(cdf)
cdf = cdf %>% left_join(metadata, by="id")
ggplot(cdf, aes(total, detected, color=column, label=condition)) +
geom_text()
library(ggplot2)
library(dplyr)
library(cowplot)
is_positive = function(column) {
return(grepl("Pos", column))
}
is_negative = function(column) {
return(grepl("Neg", column))
}
is_spikein = function(column) {
return(grepl("Spike", column))
}
is_ligation = function(column) {
return(grepl("Ligati", column))
}
is_housekeeping = function(column) {
return(grepl("Housekee", column))
}
is_prior = function(column) {
return(grepl("miR-159", column) | grepl("miR-248", column) |
grepl("miR-254", column))
}
extract_pred = function(counts, predicate) {
toplot = counts[predicate(rownames(counts)),] %>%
tibble::rownames_to_column() %>%
tidyr::gather("sample", "count", -rowname)
colnames(toplot) = c("spot", "sample", "count")
toplot = toplot %>% left_join(metadata, by=c("sample"="id"))
return(toplot)
}
spotbarplot = function(toplot) {
ggplot(toplot,
aes(sample, count)) + geom_bar(stat='identity') +
facet_wrap(~spot) +
theme(axis.text.x = element_blank(),
text = element_text(size=8))
}
spotboxplot = function(toplot) {
ggplot(toplot,
aes(linehypo, count)) + geom_boxplot() +
facet_wrap(~spot) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
Below we look at the R^2 correlation between the expected positive control concentrations and the observed concentrations for each sample.
A set of samples have a lower correlation than the other samples.
spotbarplot(extract_pred(counts, is_positive))
pcdf = data.frame(concentration=log2(c(128, 32, 8, 2, 0.5, 0.125)),
GeneName=paste("POS", c("A", "B", "C", "D", "E", "F"), sep="_"))
pccounts = subset(exprs(eset), grepl("Positive_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
function(x) cor(x, pcdf$concentration)),
sample=colnames(pccounts)) %>%
left_join(metadata, by=c("sample"="id"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
ylab("positive control correlation") +
xlab("sample")
Those are all Zymo columns. 8 of the 10 Zymo columns have this problem. If it looks like they have other issues, it might be best to just exclude them.
subset(corsamples, correlation < 0.80)$sample
## [1] "179ON_zymo_spike_OSA" "258ON_zymo_spike_OSA" "035ON_zymo_spike_OSA"
## [4] "448ON_zymo_spike_OSA" "048SS_zymo_spike_SS" "037ss_zymo_spike_SS"
## [7] "163SS_zymo_spike_SS" "114SS_zymo_spike_SS"
We can see some samples have a higher negative control count than the other samples.
spotbarplot(extract_pred(counts, is_negative))
knitr::kable(subset(extract_pred(counts, is_negative), count > 50))
| spot | sample | count | File.Name | FULL.RCC.File.Name | Pure.Sample.Name | Full.Sample.Name | Clinical.Category | What | Column | Spike.In | Notes | Date.Shipped | RCC.File.Date | condition | spiked | column | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | |||||||||||||||||
| 17 | Negative_NEG_C_ERCC_00019.1 | T1N_none_nospike_HC | 75 | ..03 | 20150421_PHS 1_01_03.RCC | T1 | T1N | Healthy Control | Healthy control 1 -vial 1 | None | None | optimization | 4.6.15 | 4.21.2015 | HC | nospike | none |
| 25 | Negative_NEG_C_ERCC_00019.1 | T1A_other_nospike_HC | 84 | ..04 | 20150421_PHS 1_01_04.RCC | T1 | T1A | Healthy Control | Healthy control 1 -vial 1 | Amicon | None | optimization | 4.6.15 | 4.21.2015 | HC | nospike | other |
| NA | |||||||||||||||||
| NA | |||||||||||||||||
| NA | |||||||||||||||||
| NA |
We’ll normalize the libraries and look at the negative control expression and then come up with a cutoff that is above the noise floor for the libraries. It looks like 30 is a reasonable noise floor to use.
drop_unusable = function(counts) {
drop = is_spikein(rownames(counts))
drop = drop | is_positive(rownames(counts))
drop = drop | is_housekeeping(rownames(counts))
drop = drop | is_ligation(rownames(counts))
keep = counts[!drop,]
keep = keep[, !grepl("Blank", colnames(keep))]
return(keep)
}
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:reshape':
##
## expand, rename
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, regroup, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
dds = DESeqDataSetFromMatrix(drop_unusable(counts), colData=metadata,
design=~spiked+column+condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = estimateSizeFactors(dds)
ncounts = data.frame(counts(dds, normalized=TRUE))
ggplot(extract_pred(ncounts, is_negative), aes(count)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nfloor = 30
We don’t have many spike in sets in this sample either, so we can’t really use the spike ins.
spotbarplot(extract_pred(counts, is_spikein))
knitr::kable(subset(extract_pred(counts, is_spikein), count > 100))
| spot | sample | count | File.Name | FULL.RCC.File.Name | Pure.Sample.Name | Full.Sample.Name | Clinical.Category | What | Column | Spike.In | Notes | Date.Shipped | RCC.File.Date | condition | spiked | column | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| NA | |||||||||||||||||
| 113 | SpikeIn_cel-miR-254|0_MIMAT0000310 | A1_none_spike_HC | 305 | ..07 | 20160114_ES011416miRNAV3a_Stock sampleA1_07.RCC | A1 | A1 | Healthy Control | Healthy control 4 - vial 1 | None | +Spike In | optimization | 1.4.16 | 1.14.16 | HC | spike | none |
| 118 | SpikeIn_cel-miR-254|0_MIMAT0000310 | A10_zymo_spike_HC | 201 | ..08 | 20160114_ES011416miRNAV3a_Stock sampleA2O_08.RCC | A10 | A10 | Healthy Control | Healthy control 4 -vial 10 | Zymo | +Spike In | optimization | 1.4.16 | 1.14.16 | HC | spike | zymo |
Some of the ligation controls are drastically different between samples.
spotbarplot(extract_pred(counts, is_ligation))
Here we look at R^2 for the ligation controls as well. It looks like one sample looks bad. It is a Zymo column.
pcdf = data.frame(concentration=log2(c(128, 32, 8)),
GeneName=paste("POS_", c("A", "B", "C"), sep="_"))
pccounts = subset(exprs(eset), grepl("Ligation_LIG_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
function(x) cor(x, pcdf$concentration)),
sample=colnames(pccounts)) %>%
left_join(metadata, by=c("sample"="id"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation, color=column)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
ylab("ligation control correlation") +
xlab("sample")
Here we calculated ligation efficiency for the samples. This is the log of LIG_POS_A(i) / mean(LIG_POS_A) for each sample i. It doesn’t matter which direction this is in, it will get corrected for in the model. Again we can see most of the Zymo columns are outliers in this.
lignorm = function(counts, feature="Ligation_LIG_POS_A") {
ligcounts = as.numeric(counts[grepl(feature, rownames(counts)),])
lnorm = log2((ligcounts + 1) / mean(ligcounts))
names(lnorm) = colnames(counts)
return(lnorm)
}
metadata$lignorm = lignorm(counts)
ggplot(metadata, aes(id, lignorm, color=column)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
ylab("ligation factor") +
xlab("sample")
Some housekeeping genes are expressed more highly in some samples. Not sure what we are supposed to do with this information, if there are any suggestions we could easily implement it.
spotbarplot(extract_pred(counts, is_housekeeping))
drop_unusable = function(counts) {
drop = is_spikein(rownames(counts))
drop = drop | is_positive(rownames(counts))
drop = drop | is_housekeeping(rownames(counts))
drop = drop | is_ligation(rownames(counts))
keep = counts[!drop,]
keep = keep[, !grepl("Blank", colnames(keep))]
return(keep)}
counts = drop_unusable(counts)
metadata = subset(metadata, id %in% colnames(counts))
Here we fit two models. The first model we fit the full model, including the ligation normalization term in the model. ~month+column+line+hypoxia+lignorm. Then we fit a reduced model without the lignorm term to answer the question ‘is there miRNA that are affected by the ligation efficiency?’
Yes, it looks like even after normalization, if we fit a model with just the ligation efficiency that there are miRNA affected. So we should correct for that. Here you can see only a subset of the miRNA are affected; if we do it like this instead of just scaling the counts, we can correct for the specific miRNA that have counts that are correlated with the ligation efficiency.
full = ~spiked+column+lignorm+condition
reduced = ~spiked+column+condition
dds = DESeqDataSetFromMatrix(counts, colData=metadata,
design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, full=full, reduced=reduced, test="LRT")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res = results(dds)
plotMA(res)
res = data.frame(res) %>% tibble::rownames_to_column() %>%
arrange(padj) %>% filter(padj < 0.05)
knitr::kable(subset(res, padj < 0.05))
| rowname | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| Endogenous1_hsa-miR-3144-3p|0.004_MIMAT0015015 | 16.154394 | -0.9027776 | 0.7035563 | 24.617774 | 0.0000007 | 0.0001418 |
| Endogenous1_hsa-miR-520f-3p|0_MIMAT0002830 | 22.485535 | 0.2807007 | 0.3115951 | 25.039419 | 0.0000006 | 0.0001418 |
| Endogenous1_hsa-miR-3065-5p|0.004_MIMAT0015066 | 15.168313 | -0.7369420 | 0.7020276 | 24.318401 | 0.0000008 | 0.0001418 |
| Endogenous1_hsa-miR-1246|0.005_MIMAT0005898 | 116.492298 | -1.2478732 | 1.0380751 | 20.863521 | 0.0000049 | 0.0005572 |
| Endogenous1_hsa-miR-128-1-5p|0_MIMAT0026477 | 12.508796 | 0.5570126 | 0.3300654 | 20.687951 | 0.0000054 | 0.0005572 |
| Endogenous1_hsa-miR-455-5p|0_MIMAT0003150 | 14.969105 | 0.0845011 | 0.3722611 | 20.359475 | 0.0000064 | 0.0005572 |
| Endogenous1_hsa-miR-5010-3p|0_MIMAT0021044 | 17.040506 | 0.0620533 | 0.4116074 | 19.903198 | 0.0000081 | 0.0006063 |
| Endogenous1_hsa-miR-302d-3p|0.006_MIMAT0000718 | 25.431722 | -0.9682754 | 0.9945260 | 19.132930 | 0.0000122 | 0.0007940 |
| Endogenous1_hsa-miR-494-3p|0.099_MIMAT0002816 | 223.791055 | 3.6463676 | 2.3874095 | 17.931783 | 0.0000229 | 0.0013255 |
| Endogenous1_hsa-miR-369-3p|0_MIMAT0000721 | 21.623242 | 0.4177668 | 0.3211437 | 16.698079 | 0.0000438 | 0.0022833 |
| Endogenous1_hsa-miR-652-5p|0_MIMAT0022709 | 9.151777 | -0.1158272 | 0.4449402 | 15.872030 | 0.0000678 | 0.0032100 |
| Endogenous1_hsa-miR-4536-3p|0_MIMAT0020959 | 16.773122 | 0.3668208 | 0.3519415 | 15.640717 | 0.0000766 | 0.0033252 |
| Endogenous1_hsa-miR-301a-3p|0_MIMAT0000688 | 17.751937 | 0.1110325 | 0.3778992 | 14.534513 | 0.0001376 | 0.0055152 |
| Endogenous1_hsa-miR-574-5p|0_MIMAT0004795 | 28.637916 | 0.0585036 | 0.3650492 | 14.110083 | 0.0001724 | 0.0064164 |
| Endogenous1_hsa-miR-137|0_MIMAT0000429 | 18.632380 | 0.7748520 | 0.3287320 | 13.656989 | 0.0002194 | 0.0076213 |
| Endogenous1_hsa-miR-4485-3p|0_MIMAT0019019 | 16.053723 | 0.2896278 | 0.3477983 | 13.495628 | 0.0002391 | 0.0077863 |
| Endogenous1_hsa-miR-526b-5p|0_MIMAT0002835 | 18.250371 | 0.5268700 | 0.3932662 | 12.731418 | 0.0003596 | 0.0110196 |
| Endogenous1_hsa-miR-449b-5p|0_MIMAT0003327 | 17.239821 | 0.5104381 | 0.3297762 | 12.596221 | 0.0003865 | 0.0111878 |
| Endogenous1_hsa-miR-532-3p|0_MIMAT0004780 | 20.808011 | 0.3421212 | 0.3604880 | 12.254842 | 0.0004641 | 0.0121706 |
| Endogenous1_hsa-miR-302a-3p|0_MIMAT0000684 | 15.481374 | 0.2836165 | 0.4152721 | 12.242215 | 0.0004672 | 0.0121706 |
| Endogenous1_hsa-miR-1228-3p|0_MIMAT0005583 | 20.384874 | 0.6922482 | 0.3517373 | 12.127399 | 0.0004969 | 0.0123270 |
| Endogenous1_hsa-miR-508-5p|0_MIMAT0004778 | 15.773746 | 0.4023408 | 0.3453795 | 11.915263 | 0.0005568 | 0.0126118 |
| Endogenous1_hsa-miR-5001-5p|0_MIMAT0021021 | 17.611257 | 0.4878175 | 0.3718612 | 11.930862 | 0.0005521 | 0.0126118 |
| Negative_NEG_G_ERCC_00144.1 | 14.468579 | 0.6237302 | 0.3667778 | 11.794852 | 0.0005939 | 0.0128936 |
| Endogenous1_hsa-miR-2682-5p|0_MIMAT0013517 | 21.076116 | -0.3665701 | 0.3835922 | 11.650844 | 0.0006417 | 0.0133737 |
| Endogenous1_hsa-miR-3613-3p|0_MIMAT0017991 | 17.943783 | 0.3191609 | 0.3342182 | 11.308388 | 0.0007716 | 0.0148886 |
| Endogenous1_hsa-miR-199a-5p|0_MIMAT0000231 | 20.873884 | 0.2023784 | 0.3238180 | 11.342755 | 0.0007574 | 0.0148886 |
| Endogenous1_hsa-miR-184|0_MIMAT0000454 | 19.362627 | 0.6129380 | 0.3778002 | 11.102818 | 0.0008620 | 0.0154857 |
| Endogenous1_hsa-miR-6720-3p|0_MIMAT0025851 | 12.246648 | 0.4268708 | 0.4125961 | 11.137908 | 0.0008458 | 0.0154857 |
| Endogenous1_hsa-miR-525-5p|0_MIMAT0002838 | 24.296625 | 0.4786314 | 0.3128564 | 10.909131 | 0.0009569 | 0.0166184 |
| Endogenous1_hsa-miR-30c-5p|0_MIMAT0000244 | 16.472750 | -0.0428976 | 0.3374711 | 10.499707 | 0.0011939 | 0.0172821 |
| Endogenous1_hsa-miR-373-3p|0_MIMAT0000726 | 14.094550 | -0.1613932 | 0.3455254 | 10.513527 | 0.0011850 | 0.0172821 |
| Endogenous1_hsa-miR-338-5p|0_MIMAT0004701 | 16.282134 | 0.3563631 | 0.3716245 | 10.504735 | 0.0011907 | 0.0172821 |
| Endogenous1_hsa-miR-522-3p|0_MIMAT0002868 | 21.037217 | 0.5356803 | 0.3474974 | 10.499366 | 0.0011942 | 0.0172821 |
| Endogenous1_hsa-miR-1268b|0_MIMAT0018925 | 21.962312 | 0.3599242 | 0.2869007 | 10.674698 | 0.0010861 | 0.0172821 |
| Endogenous1_hsa-miR-487b-3p|0_MIMAT0003180 | 11.091303 | 0.4837423 | 0.4062637 | 10.616317 | 0.0011209 | 0.0172821 |
| Endogenous1_hsa-miR-543|0_MIMAT0004954 | 18.524290 | 0.6713847 | 0.3044068 | 10.352965 | 0.0012927 | 0.0181361 |
| Endogenous1_hsa-miR-27b-3p|0_MIMAT0000419 | 11.442824 | 0.4726154 | 0.3916176 | 10.310460 | 0.0013228 | 0.0181361 |
| Endogenous1_hsa-miR-4286|0_MIMAT0016916 | 21.179109 | 0.4028977 | 0.3281633 | 10.107572 | 0.0014766 | 0.0197261 |
| Endogenous1_hsa-miR-197-5p|0_MIMAT0022691 | 14.781771 | 0.4860745 | 0.3292944 | 9.994336 | 0.0015702 | 0.0204522 |
| Endogenous1_hsa-miR-34c-3p|0_MIMAT0004677 | 20.676202 | 0.0706558 | 0.2792556 | 9.758553 | 0.0017849 | 0.0221413 |
| Endogenous1_hsa-miR-421|0_MIMAT0003339 | 17.491647 | 0.1690349 | 0.3423553 | 9.779140 | 0.0017650 | 0.0221413 |
| Endogenous1_hsa-miR-4647|0_MIMAT0019709 | 18.236624 | 0.4144892 | 0.3341588 | 9.707018 | 0.0018357 | 0.0222413 |
| Endogenous1_hsa-miR-381-3p|0_MIMAT0000736 | 13.695175 | -0.0460788 | 0.3540132 | 9.506826 | 0.0020471 | 0.0242394 |
| Endogenous1_hsa-miR-3614-5p|0_MIMAT0017992 | 19.228691 | 0.5946727 | 0.3416812 | 9.309471 | 0.0022797 | 0.0263941 |
| Endogenous1_hsa-miR-572|0_MIMAT0003237 | 13.253703 | 0.0329145 | 0.3920173 | 9.225713 | 0.0023864 | 0.0270285 |
| Endogenous1_hsa-miR-211-5p|0_MIMAT0000268 | 19.204195 | 0.7713678 | 0.3405613 | 9.168648 | 0.0024620 | 0.0272911 |
| Endogenous1_hsa-miR-382-5p|0_MIMAT0000737 | 14.964241 | -0.4628502 | 0.3591842 | 8.994089 | 0.0027085 | 0.0289324 |
| Endogenous1_hsa-miR-6503-5p|0_MIMAT0025462 | 14.494007 | 0.7332092 | 0.3466901 | 8.985646 | 0.0027211 | 0.0289324 |
| Endogenous1_hsa-miR-1206|0_MIMAT0005870 | 16.661578 | -0.1488565 | 0.3389593 | 8.881072 | 0.0028814 | 0.0292747 |
| Endogenous1_hsa-miR-3161|0_MIMAT0015035 | 8.770930 | 0.0764910 | 0.4115240 | 8.856163 | 0.0029210 | 0.0292747 |
| Endogenous1_hsa-miR-10a-5p|0_MIMAT0000253 | 19.430113 | 0.5751018 | 0.3130499 | 8.855635 | 0.0029219 | 0.0292747 |
| Endogenous1_hsa-miR-891a-5p|0_MIMAT0004902 | 22.442521 | 0.3218974 | 0.3422574 | 8.750903 | 0.0030945 | 0.0304194 |
| Endogenous1_hsa-miR-33b-5p|0_MIMAT0003301 | 21.862140 | 0.6661927 | 0.3448169 | 8.714991 | 0.0031560 | 0.0304499 |
| Endogenous1_hsa-miR-155-5p|0_MIMAT0000646 | 30.834213 | -0.4000943 | 0.3877161 | 8.519452 | 0.0035137 | 0.0321097 |
| Endogenous1_hsa-miR-3136-5p|0_MIMAT0015003 | 11.347316 | 0.3925598 | 0.3490501 | 8.457093 | 0.0036362 | 0.0321097 |
| Endogenous1_hsa-miR-922|0_MIMAT0004972 | 17.087206 | 0.1018960 | 0.3583880 | 8.576458 | 0.0034054 | 0.0321097 |
| Endogenous1_hsa-miR-183-5p|0_MIMAT0000261 | 24.081003 | 0.7381933 | 0.3084478 | 8.481537 | 0.0035877 | 0.0321097 |
| Endogenous1_hsa-miR-2113|0_MIMAT0009206 | 10.063128 | 0.5908673 | 0.4090225 | 8.502772 | 0.0035461 | 0.0321097 |
| Endogenous1_hsa-miR-1266-5p|0_MIMAT0005920 | 16.066163 | 0.2193273 | 0.2980987 | 8.297277 | 0.0039705 | 0.0344248 |
| Endogenous1_hsa-miR-512-5p|0_MIMAT0002822 | 16.558368 | 0.4495301 | 0.3449292 | 8.197171 | 0.0041956 | 0.0344248 |
| Endogenous1_hsa-miR-548e-5p|0_MIMAT0026736 | 17.029038 | 0.1914604 | 0.3225284 | 8.201139 | 0.0041864 | 0.0344248 |
| Endogenous1_hsa-miR-187-3p|0_MIMAT0000262 | 16.403869 | 0.4397486 | 0.3330983 | 8.182877 | 0.0042288 | 0.0344248 |
| Endogenous1_hsa-miR-767-5p|0_MIMAT0003882 | 16.911194 | 0.3655126 | 0.3676737 | 8.200679 | 0.0041875 | 0.0344248 |
| Endogenous1_hsa-miR-582-3p|0_MIMAT0004797 | 12.973997 | 0.5421819 | 0.3585611 | 8.124245 | 0.0043677 | 0.0344785 |
| Endogenous1_hsa-miR-449a|0_MIMAT0001541 | 8.114853 | 0.1930454 | 0.4756947 | 8.144047 | 0.0043203 | 0.0344785 |
| Endogenous1_hsa-miR-138-5p|0_MIMAT0000430 | 19.137156 | -0.2817369 | 0.3437245 | 7.696189 | 0.0055338 | 0.0376590 |
| Endogenous1_hsa-miR-423-3p|0_MIMAT0001340 | 27.126050 | -0.3966050 | 0.3138738 | 7.404693 | 0.0065054 | 0.0376590 |
| Endogenous1_hsa-miR-1307-5p|0_MIMAT0022727 | 19.934626 | 0.1644425 | 0.3349698 | 7.411452 | 0.0064810 | 0.0376590 |
| Endogenous1_hsa-miR-34b-3p|0_MIMAT0004676 | 16.321653 | -0.1801052 | 0.3385198 | 7.853464 | 0.0050723 | 0.0376590 |
| Endogenous1_hsa-miR-30b-5p|0_MIMAT0000420 | 25.120072 | -0.5503289 | 0.3473148 | 7.535839 | 0.0060484 | 0.0376590 |
| Endogenous1_hsa-miR-4531|0_MIMAT0019070 | 18.544776 | -0.0995727 | 0.3249445 | 7.571689 | 0.0059292 | 0.0376590 |
| Endogenous1_hsa-miR-671-5p|0_MIMAT0003880 | 14.598270 | -0.5034353 | 0.3805830 | 7.481211 | 0.0062346 | 0.0376590 |
| Endogenous1_hsa-miR-337-5p|0_MIMAT0004695 | 20.455689 | 0.4918635 | 0.3039373 | 7.567834 | 0.0059419 | 0.0376590 |
| Endogenous1_hsa-miR-548a-5p|0_MIMAT0004803 | 27.830500 | 0.6361400 | 0.3825574 | 7.500470 | 0.0061683 | 0.0376590 |
| Endogenous1_hsa-miR-132-3p|0_MIMAT0000426 | 13.169093 | 0.3280749 | 0.4226051 | 7.443532 | 0.0063665 | 0.0376590 |
| Endogenous1_hsa-miR-548k|0_MIMAT0005882 | 14.300429 | 0.3224224 | 0.3819228 | 7.506145 | 0.0061489 | 0.0376590 |
| Endogenous1_hsa-miR-3605-5p|0_MIMAT0017981 | 10.313653 | 0.3250684 | 0.4370139 | 7.422404 | 0.0064417 | 0.0376590 |
| Endogenous1_hsa-miR-517c-3p+hsa-miR-519a-3p|0_MIMAT0002866 | 28.911200 | 0.2973478 | 0.3895462 | 7.485860 | 0.0062185 | 0.0376590 |
| Endogenous1_hsa-miR-660-3p|0_MIMAT0022711 | 19.818373 | 0.2329490 | 0.3830516 | 7.811576 | 0.0051913 | 0.0376590 |
| Endogenous1_hsa-miR-551b-3p|0_MIMAT0003233 | 18.787583 | 0.1702477 | 0.3522732 | 7.649510 | 0.0056788 | 0.0376590 |
| Endogenous1_hsa-miR-607|0_MIMAT0003275 | 20.062353 | 0.3874095 | 0.2965702 | 7.500453 | 0.0061683 | 0.0376590 |
| Endogenous1_hsa-miR-1296-3p|0_MIMAT0026637 | 11.566780 | 0.9848154 | 0.3967151 | 7.647508 | 0.0056851 | 0.0376590 |
| Endogenous1_hsa-miR-199b-5p|0_MIMAT0000263 | 15.288132 | 0.5409972 | 0.3429482 | 7.531401 | 0.0060633 | 0.0376590 |
| Endogenous1_hsa-miR-514a-3p|0_MIMAT0002883 | 20.171994 | 0.4727561 | 0.3208240 | 7.504905 | 0.0061531 | 0.0376590 |
| Endogenous1_hsa-miR-335-5p|0_MIMAT0000765 | 18.744085 | 0.6214970 | 0.3538713 | 7.932888 | 0.0048544 | 0.0376590 |
| Endogenous1_hsa-miR-199a-3p+hsa-miR-199b-3p|0_MIMAT0000232 | 25.579517 | -0.5094305 | 0.4461006 | 7.453202 | 0.0063324 | 0.0376590 |
| Endogenous1_hsa-miR-513a-3p|0_MIMAT0004777 | 23.417255 | 0.1990901 | 0.3463820 | 7.579395 | 0.0059039 | 0.0376590 |
| Endogenous1_hsa-miR-188-3p|0_MIMAT0004613 | 10.702219 | 0.6061570 | 0.3834415 | 7.447019 | 0.0063542 | 0.0376590 |
| Endogenous1_hsa-miR-2116-5p|0_MIMAT0011160 | 8.801659 | 0.6553138 | 0.3957661 | 7.592225 | 0.0058621 | 0.0376590 |
| Endogenous1_hsa-miR-1245a|0_MIMAT0005897 | 14.623967 | 0.2202017 | 0.3881365 | 7.377185 | 0.0066056 | 0.0378192 |
| Endogenous1_hsa-miR-542-3p|0_MIMAT0003389 | 16.732171 | 0.5297783 | 0.3753294 | 7.323801 | 0.0068047 | 0.0381211 |
| Endogenous1_hsa-miR-378d|0_MIMAT0018926 | 20.172643 | 0.6257212 | 0.3385224 | 7.343099 | 0.0067321 | 0.0381211 |
| Endogenous1_hsa-let-7a-5p|0.006_MIMAT0000062 | 15.413694 | 0.1996921 | 2.3777768 | 7.213052 | 0.0072375 | 0.0399564 |
| Endogenous1_hsa-miR-518e-3p|0_MIMAT0002861 | 9.115780 | 0.3259874 | 0.4636558 | 7.201142 | 0.0072857 | 0.0399564 |
| Endogenous1_hsa-miR-628-3p|0_MIMAT0003297 | 12.523421 | -0.4142201 | 0.3921215 | 7.141566 | 0.0075317 | 0.0401134 |
| Endogenous1_hsa-miR-30e-3p|0_MIMAT0000693 | 17.391121 | 0.5948340 | 0.3273287 | 7.138333 | 0.0075453 | 0.0401134 |
| Endogenous1_hsa-miR-495-5p|0_MIMAT0022924 | 18.221403 | 0.5994959 | 0.3307664 | 7.147518 | 0.0075068 | 0.0401134 |
| Endogenous1_hsa-miR-564|0_MIMAT0003228 | 25.316992 | -0.2625442 | 0.3523026 | 6.905362 | 0.0085938 | 0.0434694 |
| Endogenous1_hsa-miR-1226-3p|0_MIMAT0005577 | 16.580422 | 0.4567798 | 0.3272328 | 6.905968 | 0.0085909 | 0.0434694 |
| Endogenous1_hsa-miR-515-3p|0_MIMAT0002827 | 24.116855 | -0.1794401 | 0.2959484 | 6.946050 | 0.0084005 | 0.0434694 |
| Endogenous1_hsa-miR-21-5p|0_MIMAT0000076 | 24.101615 | -1.4589421 | 0.3837023 | 6.934158 | 0.0084565 | 0.0434694 |
| Endogenous1_hsa-miR-345-3p|0_MIMAT0022698 | 17.603977 | 0.5760853 | 0.3806965 | 6.914550 | 0.0085497 | 0.0434694 |
| Endogenous1_hsa-miR-587|0_MIMAT0003253 | 21.431597 | -0.1268609 | 0.3514207 | 6.867602 | 0.0087772 | 0.0439706 |
| Endogenous1_hsa-miR-514b-5p|0_MIMAT0015087 | 26.961500 | 1.0057010 | 0.2828945 | 6.758917 | 0.0093280 | 0.0462848 |
| Endogenous1_hsa-miR-216b-5p|0_MIMAT0004959 | 22.347780 | 0.0793256 | 0.3483973 | 6.694134 | 0.0096731 | 0.0470997 |
| Endogenous1_hsa-miR-4787-5p|0_MIMAT0019956 | 17.134283 | -0.4047326 | 0.3634773 | 6.710449 | 0.0095850 | 0.0470997 |
| Endogenous1_hsa-miR-519d-3p|0_MIMAT0002853 | 17.130403 | -0.3224137 | 0.3316804 | 6.628411 | 0.0100365 | 0.0484167 |
| Endogenous1_hsa-miR-656-3p|0_MIMAT0003332 | 20.600164 | -0.2671215 | 0.3247474 | 6.555701 | 0.0104549 | 0.0497452 |
| Endogenous1_hsa-miR-510-3p|0_MIMAT0026613 | 13.758661 | 0.9270255 | 0.3527670 | 6.543322 | 0.0105279 | 0.0497452 |
| Endogenous1_hsa-miR-939-5p|0_MIMAT0004982 | 20.253974 | 0.5625802 | 0.3394075 | 6.531459 | 0.0105983 | 0.0497452 |
| Endogenous1_hsa-miR-1185-5p|0_MIMAT0005798 | 16.192813 | 0.8719359 | 0.3760341 | 6.509214 | 0.0107317 | 0.0499215 |
Here we can see the choice of column is what separates out the samples, it is going to be hard to call differences because the column type is confounded with the sample type for the most part. We only have one observation of non-zymo columns for the OSA and SS samples.
dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
vst = varianceStabilizingTransformation(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
pca_loadings = function(object, ntop=500) {
rv <- matrixStats::rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
names(percentVar) = colnames(pca$x)
pca$percentVar = percentVar
return(pca)}
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
library(dplyr)
comps = comps %>% left_join(metadata, by=c("Name"="id"))
pca_plot = function(comps, nc1, nc2, colorby) {
c1str = paste0("PC", nc1)
c2str = paste0("PC", nc2)
ggplot(comps, aes_string(c1str, c2str, color=colorby)) +
geom_point() + theme_bw() +
xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance"))
}
pca_plot(comps, 1, 2, "condition")
pca_plot(comps, 1, 2, "column")
It isn’t clear what is separating out the samples on the lower left.
write_results = function(res, fname) {
res = data.frame(res) %>% tibble::rownames_to_column() %>%
arrange(pvalue)
write.table(res, file=fname, quote=FALSE, col.names=TRUE,
row.names=FALSE, sep=",")
}
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)
res = results(dds, contrast=c("condition", "OSA", "HC"))
plotMA(res)
write_results(res, "OSA-vs-HC.csv")
res = results(dds, contrast=c("condition", "OSA", "SS"))
plotMA(res)
write_results(res, "OSA-vs-SS.csv")
res = results(dds, contrast=c("condition", "SS", "HC"))
plotMA(res)
write_results(res, "SS-vs-HC.csv")
The 5-11-16 samples are all Zymo columns, all have spike-ins and are two conditions. This is the cleanest set to look at. We’ll subset the data down to these samples and run the analysis. Since these are all spike-ins we can also normalize by the spike in data. We’ll renormalize the ligation data as well.
scaled = correct_miRNA(eset)
counts = as.data.frame(scaled)
colnames(counts) = metadata$id
maymeta = subset(metadata, Date.Shipped == "5.11.16")
maycounts = counts[, maymeta$id]
maymeta$lignorm = lignorm(maycounts)
Here we will calculate normalization values using the formula suggested by the NanoString folks with the spikeins. We calculate the geometric of all of the spike ins and divide that by the average of the geometric mean
colGeo = function(mat) {
lmat = log2(mat + 1)
return(colMeans(lmat))
}
spikenormalize = function(counts) {
counts = counts[is_spikein(rownames(counts)),]
geo = colGeo(counts)
return(mean(geo) / geo)
}
maymeta$spikenorm = spikenormalize(maycounts)
deseq2resids = function(dds) {
fitted = t(t(assays(dds)[["mu"]]) / sizeFactors(dds))
return(counts(dds, normalized=TRUE) - fitted)
}
design = ~lignorm+condition
maycounts = drop_unusable(maycounts)
dds = DESeqDataSetFromMatrix(maycounts, colData=maymeta, design=design)
## converting counts to integer mode
## factor levels were dropped which had no samples
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
maymeta$sizefactor = sizeFactors(dds)
resdnorm = results(dds, addMLE=TRUE)
DESeq2 has a similar normalization scheme to the spike-ins but instead uses all of the genes, not just the spike-ins. For each gene a scaling factor is created and the median of those scaling factors for each sample is the sizeFactor. How does this compare to just using the spike-ins?
ggplot(maymeta, aes(sizefactor, spikenorm)) +
geom_point() +
xlab("DESeq2 normalization") +
ylab("Spike-in normalization")
Not great! We’ll manually set the size factor for these libraries, then.
dds = DESeqDataSetFromMatrix(maycounts, colData=maymeta, design=design)
## converting counts to integer mode
## factor levels were dropped which had no samples
sizeFactors(dds) = maymeta$spikenorm
dds = DESeq(dds, fitType='local')
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ressnorm = results(dds, addMLE=TRUE)
How did that affect the results? It doesn’t seem to affect them much, plotting the p-values and log2 fold changes against each other:
qplot(resdnorm$pvalue, ressnorm$pvalue) + geom_point() +
xlab("DESeq2 normalization p-value") + ylab("Spike-in normalization p-value")
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_point).
qplot(resdnorm$log2FoldChange, ressnorm$log2FoldChange) + geom_point() +
xlab("DESeq2 normalization log2FC") + ylab("Spike-in normalization log2FC")
## Warning: Removed 19 rows containing missing values (geom_point).
## Warning: Removed 19 rows containing missing values (geom_point).
But, overall, the size factors were within 10% of each other for all of the samples, so it isn’t surprising that shifting them doesn’t affect the results much. We’ll use the results from the NanoString normalization though.
res = results(dds, contrast=c("condition", "OSA", "SS"), addMLE=TRUE)
plotMA(res)
write_results(res, "may-OSA-vs-SS.csv")
Again, we’re not seeing much here. Nothing pops out as significant; I think to do this we need many more samples.
Even looking at some of the higher order PCA components, we’re not seeing any convincing separation.
vst = varianceStabilizingTransformation(dds)
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
comps = comps %>% left_join(maymeta, by=c("Name"="id"))
pca_plot(comps, 1, 2, "condition")
pca_plot(comps, 3, 4, "condition")
pca_plot(comps, 5, 6, "condition")
With more samples, and more metadata about the samples, we could maybe dive into what might be causing the increased variation, but without that we’re mostly sunk.